from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

import pandas as pd
from sklearn.model_selection import train_test_split
import time

# Load the passenger table; the first CSV column becomes the row index.
data = pd.read_csv("titanic.csv", index_col=0)

# Features: age, sex and passenger class.  Work on a copy so the cleaning
# steps below never write into `data`.
X = data[["age", "sex", "pclass"]].copy()
# Target: 1 = survived, 0 = died.
y = data["survived"]

# --- Data cleaning ---
# Missing ages are filled with the median age.
# NOTE: the median is computed over all rows, i.e. before the train/test split.
X["age"] = X["age"].fillna(X["age"].median())

# Passenger class -> integer: keep the leading character and parse it.
# Presumably the labels start with the class digit (e.g. "1st") - confirm
# against the CSV.  Casting to str first means a column that is already
# numeric (1/2/3) is handled too, instead of failing on the `.str` accessor.
X["pclass"] = X["pclass"].astype(str).str[0].astype(int)

# Sex -> binary indicator: 1 for "male", 0 for every other value.
X["sex"] = (X["sex"] == "male").astype(int)

# Column labels of the cleaned feature table, kept for labelling tree plots.
feature_names = X.columns

# Hold out 20% of the rows for evaluation.  `stratify=y` keeps the class
# proportions of `y` the same in both parts; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# n_estimators (the number of trees) is left at its default of 100.
# random_state is fixed so that, like the split above, the fitted forest and
# therefore the reported accuracy are the same on every run.
rf = RandomForestClassifier(
    min_samples_split=50,
    min_samples_leaf=10,
    random_state=1,
)

# Time fitting plus scoring.  perf_counter() is a monotonic clock meant for
# measuring intervals; the elapsed time is taken before anything is printed
# so console I/O is not included in the measurement.
start = time.perf_counter()
rf.fit(X_train, y_train)
accuracy = rf.score(X_test, y_test)
elapsed = time.perf_counter() - start

print("准确率", accuracy)
print("总时间", elapsed)

# Visualising the random forest: pick one fitted tree (index 2) and dump it
# to a Graphviz .dot file.
single_tree = rf.estimators_[2]
export_graphviz(
    single_tree,
    out_file="rf_2.dot",
    feature_names=feature_names,
    # Listed in ascending class order: 0 = died, 1 = survived.
    class_names=["死", "生"],
)

"""
树的深度、随机森林的树的数量，参数的最优选择 可以通过交叉验证
"""

"""
随机森林时间：0.3253951072692871
决策树：0.02045416831970215
"""
